Murine species trees & rates

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(dplyr)
library(kableExtra)
library(tidyr)
library(ggtree)
library(phytools)
library(phangorn)
library(reshape2)
library(ggExtra)
library(ggrepel)
library(vroom)
library(ggdist)
source("C:/bin/core/r/design.r")
source("C:/bin/core/r/get_tree_info.r")
#source("C:/Users/grt814/bin/core/corelib/design.r")
#source("C:/Users/grt814/bin/core/get_tree_info.r")

#htmltools::includeHTML("../html-chunks/rmd_nav.html")

< Back to summary

1 Full coding species tree

#tree_type = "astral"
save_tree_fig = F

if(tree_type == "astral"){
  cat("188 species.\n11,775 coding loci.\nGene trees inferred with IQtree.\nSpecies tree inferred with ASTRAL (no branch lengths shown).\n")
}else if(tree_type == "concat"){
  cat("188 species.\n11,775 coding loci.\nGene trees inferred with IQtree.\nSpecies tree inferred by concatenation of all loci with IQtree.\n")
}
## 188 species.
## 11,775 coding loci.
## Gene trees inferred with IQtree.
## Species tree inferred by concatenation of all loci with IQtree.
tree_file = "../../data/trees/full_coding_iqtree_astral.cf.rooted.tree"
#tree_file = "../../data/trees/full_coding_iqtree_astral_PARED_gcf50.tree"
astral_tree = read.tree(tree_file)

tree_to_df_list = treeToDF(astral_tree)
tree_info_astral = tree_to_df_list[["info"]]
#tree_info_astral = treeToDF(astral_tree)
tree_info_astral = tree_info_astral %>% separate(label, c("astral", "gcf", "scf"), sep="/", remove=F)
tree_info_astral$astral[tree_info_astral$node.type=="tip"] = NA
tree_info_astral$astral = as.numeric(tree_info_astral$astral)
tree_info_astral$gcf = as.numeric(tree_info_astral$gcf)
tree_info_astral$scf = as.numeric(tree_info_astral$scf)

# Read astral tree data

concat_file = "../../data/trees/full_coding_iqtree_concat.cf.rooted.tree"
concat_tree = read.tree(concat_file)

tree_to_df_list = treeToDF(concat_tree)
tree_info_concat = tree_to_df_list[["info"]]
#tree_info_concat = treeToDF(concat_tree)
tree_info_concat = tree_info_concat %>% separate(label, c("bootstrap", "gcf", "scf"), sep="/", remove=F)
tree_info_concat$bootstrap[tree_info_concat$node.type=="tip"] = NA
tree_info_concat$bootstrap = as.numeric(tree_info_concat$bootstrap)
tree_info_concat$gcf = as.numeric(tree_info_concat$gcf)
tree_info_concat$scf = as.numeric(tree_info_concat$scf)
# Read concat tree data

#rf = RF.dist(concat_tree, astral_tree)
#nrf = RF.dist(concat_tree, astral_tree, normalize=T)

#treecomp = comparePhylo(concat_tree, astral_tree, plot=T)
# Stores node ids of common clades between trees!

#write.csv(tree_info, "../../data/trees/full-coding-concat-cf-rooted.csv", row.names=F)

if(tree_type == "astral"){
  tree_info = tree_info_astral
  rodent_tree = astral_tree
  xmax = 31
  # For unpared
  #xmax = 24
  # For pared
  iq_tree_labels = "../../data/trees/full_coding_iqtree_astral.cf.branch.rooted"
  cf_stat_file = "../../data/trees/full_coding_iqtree_astral.cf.stat"
  cf_rep_dir = "../../data/trees/astral-cf-reps/"
  delta_outfile = "../../data/trees/astral-delta.tab"
}else if(tree_type == "concat"){
  tree_info = tree_info_concat
  rodent_tree = concat_tree
  xmax = 0.125
  iq_tree_labels = "../../data/trees/full_coding_iqtree_concat.cf.branch.rooted"
  cf_stat_file = "../../data/trees/full_coding_iqtree_concat.cf.stat"
  cf_rep_dir = "../../data/trees/concat-cf-reps/"
  delta_outfile = "../../data/trees/concat-delta.tab"
}

cf_stats = read.table(cf_stat_file, header=T)
# The node/branch labels in R and IQtree differ. IQtree uses a nice, logical post-ordering
# of internal nodes while R does something random and assigns labels to tips as well. This 
# chunk matches those up for the delta analysis later

iq_tree = read.tree(iq_tree_labels)
iqtree_to_df_list = treeToDF(iq_tree)
iqtree_info = iqtree_to_df_list[["info"]]
# Read the IQ tree tree with branch labels in and parse with get_tree_info

#node_convert = matchNodes(tree_to_df_list[["labeled.tree"]], iqtree_to_df_list[["labeled.tree"]], method="descendants")

tree_info$iqtree.node = NA
# Add a column to the main tree table about IQ tree labels

for(i in 1:nrow(tree_info)){
  cur_node = tree_info[i,]$node
  iqtree_row = subset(iqtree_info, node==cur_node)
  iqtree_label = iqtree_row$label
  tree_info[i,]$iqtree.node = iqtree_label
}
# For every row in the main tree table, add in the IQ tree node label given that
# we've read the same tree in R and can use the node.labels

1.0.1 Species tree with branches colored by gene concordance factor

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

gcf_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$gcf)) +
  scale_color_continuous(name='gCF', low=l, high=h, limits=c(0,100)) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  theme(legend.position=c(0.05,0.9))
print(gcf_tree)

if(save_tree_fig){
  gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
  tree_outfile = paste("../../data/trees/", tree_type, "-gcf-tree-PARED-gcf50.pdf", sep="")
  ggsave(tree_outfile, gcf_tree, width=8, height=16, unit="in")
}
# gCF tree

1.0.2 Species tree with branches colored by site concordance factor

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

scf_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$scf)) +
  scale_color_continuous(name='sCF', low=l, high=h, limits=c(0,100)) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  theme(legend.position=c(0.05,0.9))
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(scf_tree)

#ggsave("../data/trees/scf-tree.pdf", scf_tree, width=8, height=16, unit="in")
# sCF tree

1.0.3 Gene vs site concordance factors colored by branch support

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

if(tree_type == "astral"){
  p = ggplot(tree_info, aes(x=gcf, y=scf, color=astral)) + 
    geom_point(size=2, alpha=0.5) +
    scale_color_continuous(name='Astral support', low=l, high=h, limits=c(0.8,1))
}else if(tree_type == "concat"){
  p = ggplot(tree_info, aes(x=gcf, y=scf, color=bootstrap)) + 
    geom_point(size=2, alpha=0.5) +
    scale_color_continuous(name='Bootstrap', low=l, high=h, limits=c(0,100))
}
p = p + bartheme() +
  theme(legend.title=element_text(size=12))
print(p)

1.0.4 Concordance factors vs. branch lengths

bl_gcf_p = ggplot(tree_info, aes(x=branch.length, y=gcf)) +
  geom_point(size=3, alpha=0.5) +
  #geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
  #geom_smooth(method="lm", se=F, ) +
  xlab("Branch length") + 
  ylab("gCF per branch") + 
  bartheme()
  if(tree_type=="concat"){
    bl_gcf_p = bl_gcf_p + scale_x_continuous(limits=c(0,0.035))
  }
  #theme(legend.position="bottom") +
  #guides(colour = guide_legend(override.aes = list(alpha = 1)))
bl_gcf_p = ggExtra::ggMarginal(bl_gcf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666")

bl_scf_p = ggplot(tree_info, aes(x=branch.length, y=scf)) +
  geom_point(size=3, alpha=0.5) +
  #geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
  #geom_smooth(method="lm", se=F, ) +
  xlab("Branch length") + 
  ylab("sCF per branch") + 
  bartheme()
  if(tree_type=="concat"){
    bl_scf_p = bl_scf_p + scale_x_continuous(limits=c(0,0.035))
  }
  #theme(legend.position="bottom") +
  #guides(colour = guide_legend(override.aes = list(alpha = 1)))
bl_scf_p = ggExtra::ggMarginal(bl_scf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=5), color="#666666")

p = plot_grid(bl_gcf_p, bl_scf_p, ncol=2)
print(p)

2 Gene trees

The 11,774 gene trees contain varying numbers of taxa due to filtering on the alignment and filtering by IQtree for identical sequences.

2.0.1 Distribution of taxa per gene tree

gt_file = "../../data/trees/loci.treefile"
gt = read.tree(gt_file)
gt_data = data.frame("rfs"=c(), "num.tips"=c(), "rf.zeros"=c(), "num.tips.zeros"=c())
for(i in 1:length(gt)){
  cur_tips = gt[[i]]$tip.label

  pruned_tree = drop.tip(rodent_tree, rodent_tree$tip.label[-match(cur_tips, rodent_tree$tip.label)])
  
  cur_rf = RF.dist(pruned_tree, gt[[i]], normalize=T)
  
  if(cur_rf==0){
    gt_data = rbind(gt_data, data.frame("rfs"=cur_rf, "num.tips"=length(cur_tips), "rf.zeros"=cur_rf, "num.tips.zeros"=length(cur_tips)))
  }else{
    gt_data = rbind(gt_data, data.frame("rfs"=cur_rf, "num.tips"=length(cur_tips), "rf.zeros"=NA, "num.tips.zeros"=NA))
  }
}

p = ggplot(gt_data, aes(x=num.tips)) +
  geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=6), color="#666666") +
  #geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  #geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
  scale_y_continuous(expand=c(0,0)) +
  xlab("# of taxa") +
  ylab("# of gene trees") +
  bartheme()
print(p)

2.0.2 Tree distance (RF) distribution between gene trees and species tree

Since RF cannot handle missing taxa, the species tree is pruned for each gene tree to calculate Robinson-Foulds distance. We use the normalized metric since there are varying numbers of tips per gene tree.

p = ggplot(gt_data, aes(x=rfs)) +
  geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666") +
  #geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  #geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
  scale_y_continuous(expand=c(0,0)) +
  xlab("Normalized RF") +
  ylab("# of gene trees") +
  bartheme()
print(p)

2.0.3 Delta

2.0.3.1 Brief introduction

For each lineage in the species tree with gCF < 95% we calculated the \(\Delta\) statistic (Huson et al. 2005). This statistic follows the same logic as the ABBA-BABA site patterns used to calculate D-statistics, but uses gene tree topologies instead of alignment sites. Briefly, a given branch in an unrooted tree is defined by a quartet of species groupings with two possible discordant topologies, \(D_1\) and \(D_2\) (see Figure 1 from Minh et al. 2020). Under assumptions that discordance is caused by ILS, both discordant topologies should be present in equal proportions. However, if introgression has occurred one discordant topology will appear more frequently than the other. \(\Delta\) is calculated for a branch as follows, using the frequency of each discordant topology (Vanderpool et al. 2020):

\[\Delta = \frac{D_1 - D_2}{D_1 + D_2}\]

This normalized \(\Delta\) calculation ensures that all values are scaled between 0 and 1, with larger values indicating a larger skew towards one topology, and a higher chance that introgression has occurred.

To test whether the observed \(\Delta\) values are skewed significantly from 0 to imply introgression, we performed concordance factor analysis on 1,000 bootstrap replicates of our inferred gene trees to generate a null distribution of \(\Delta\) values. We then calculated Z-scores and p-values and assessed significance for each branch at a threshold of 0.01.

2.0.3.2 Results

Nodes with p < 0.01:

cf_stats$delta = (abs(cf_stats$gDF1_N - cf_stats$gDF2_N)) / (cf_stats$gDF1_N + cf_stats$gDF2_N)
# Calculate the delta values on the actual data

iqtree_delta = select(cf_stats, ID, delta)
names(iqtree_delta) = c("iqtree.node", "delta")
tree_info = merge(x = tree_info, y = iqtree_delta, by = "iqtree.node", all.x=TRUE)
# Adding the delta values to the main tree info table

tree_info = tree_info[order(tree_info$node), ]
# Re-sort the data frame by R node order after the merge so the trees still work

low_cf_nodes = subset(cf_stats, gCF < 95)
# Get the low concordance factor nodes from the data to test with delta

delta_null = c()
for(i in 0:999){
  cur_rep_str = as.character(i)
  #print(cur_rep_str)
  while(nchar(cur_rep_str) < 4){
    cur_rep_str = paste("0", cur_rep_str, sep="")
  }
  # Handling the string of the rep
  
  cur_cf_file = paste(cf_rep_dir, "rep", cur_rep_str, ".cf.stat", sep="")
  cf_rep = read.table(cur_cf_file, header=T, fill=T)
  # Read the current reps cf file
  
  cf_rep$delta = (abs(cf_rep$gDF1_N - cf_rep$gDF2_N)) / (cf_rep$gDF1_N + cf_rep$gDF2_N)
  delta_null = c(delta_null, cf_rep$delta)
  # Calculate delta for this rep and save values in vector
}
# Read concordance factors from bootstrap samples of gene trees and calculate delta to generate
# null distribution

delta_null_df = data.frame("delta"=delta_null, y="duh")
delta_null_df = subset(delta_null_df, !is.nan(delta))
# Convert the delta values to a data frame for ggplot

delta_mu = mean(delta_null_df$delta, na.rm=T)
delta_sd = sd(delta_null_df$delta, na.rm=T)
# Calculate the mean and sd of the null distribution to get z-scores and p-values

delta_out = data.frame("node"=c(), "delta"=c(), "z-score"=c(), "p-value"=c())
# Initialize output data frame

tree_info$delta.sig = F
# Add a column to the tree info table about significant delta values

for(i in 1:nrow(low_cf_nodes)){
  row = low_cf_nodes[i,]
  z = (row$delta - delta_mu) / delta_sd
  p = pnorm(-abs(z))
  if(p < 0.01){
    print(paste(row$ID, row$delta, z, p, sep=" "))
    tree_info$delta.sig[tree_info$iqtree.node==row$ID] = T
    # Set the delta significant to TRUE in the main tree info table
  }
  delta_out = rbind(delta_out, data.frame("node"=row$ID, "delta"=row$delta, "z-score"=z, "p-value"=p))
}
## [1] "204 0.41615356754799 2.61636432115226 0.00444358382217448"
## [1] "216 0.485564304461942 3.23460722856629 0.00060905085758849"
## [1] "217 0.451743714517437 2.93336651585538 0.00167653901552935"
## [1] "261 0.511338697878566 3.46418029796737 0.000265924955371119"
## [1] "268 0.407725321888412 2.54129375811478 0.00552215416576702"
## [1] "291 0.40548137737175 2.52130689666168 0.005845991457981"
## [1] "294 0.484662576687117 3.22657552013053 0.000626405829970978"
## [1] "305 0.441067457375834 2.83827286681796 0.00226791942646311"
# Calculate z and p for each low gCF node in the species tree and save to output data frame

write.table(file=delta_outfile, delta_out, sep="\t", row.names=F)

delta_null_p = ggplot(delta_null_df, aes(x=delta)) +
  geom_histogram(color="#ececec", bins=50) +
  scale_x_continuous(limits=c(0,1)) +
  scale_y_continuous(expand=c(0,0)) +
  xlab("Delta") + 
  ylab("# nodes") +
  bartheme()

delta_actual_p = ggplot(low_cf_nodes, aes(x=delta)) +
  geom_histogram(fill="#920000", color="#ececec") +
  scale_x_continuous(limits=c(0,1)) +
  scale_y_continuous(expand=c(0,0)) +
  xlab("Delta") +
  ylab("# nodes") +
  bartheme()

delta_p = plot_grid(delta_null_p, delta_actual_p, ncol=2, labels=c("Null distribution", "Actual distribution"), label_size=12)
print(delta_p)

2.0.4 Branches with evidence for introgression

h = corecol(numcol=1, pal="wilke", offset=1)
l = corecol(numcol=1, offset=1)

intro_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$delta.sig)) +
  scale_color_manual(name="Significant Delta", labels=c("False", "True"), values=corecol(pal="trek", numcol=2)) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  theme(legend.position=c(0.05,0.9)) +
  geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(node),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(intro_tree)

2.0.5 CF and Delta

l = corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
# Colors

p = ggplot(tree_info, aes(x=gcf, y=scf, color=delta)) + 
  geom_point(size=2, alpha=0.5) +
  geom_text_repel(aes(label=ifelse(delta.sig,as.character(node),'')), show_guide=F, min.segment.length=0) +
  scale_color_continuous(name='Delta', low=l, high=h) +
  bartheme() +
  theme(legend.title=element_text(size=12))

print(p)

< Back to summary

3 Average rates per gene with basic model fit (mg94-local)

We ran each of the 11,775 coding loci through HyPhy’s standard MG94 fit with the -local option to estimate a rate for each branch in the input tree.

For input trees we used the gene tree estimated from each individual alignment to reduce false inferences of substitutions that result from tree misspecification.

rates_file = "../../data/rates/full-coding-slac.csv.gz"
#mg94_local_file = "../../data/rates/full-coding-mg94-local.csv"
rates = vroom(rates_file, comment="#")

rates$dn = rates$N / rates$EN
rates$ds = rates$S / rates$ES
rates$dn.ds = rates$dn / rates$ds

gene_rates = group_by(rates, file) %>% summarize(dn=mean(dn, na.rm=T), ds=mean(ds, na.rm=T), dn.ds=mean("dn/ds", na.rm=T))

p = ggplot(subset(gene_rates, ds < 0.05), aes(x=ds, y=dn)) +
  geom_point(size=2, alpha=0.2, color="#333333") +
  #geom_smooth(method="lm", se=F, ) +
  xlab("Avg. dS per gene") + 
  ylab("Avg. dN per gene") + 
  bartheme()
p = ggExtra::ggMarginal(p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1), color="#666666")
print(p)

ds_p = ggplot(subset(gene_rates, ds < 0.05), aes(x=ds)) +
  geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1), color="#666666") +
  scale_y_continuous(expand=c(0,0)) +
  xlab("dS") +
  ylab("# of genes") +
  bartheme()
print(ds_p)
# Distribution of dS when using concatenated tree
dn_p = ggplot(gene_rates, aes(x=dn)) +
  geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1), color="#666666") +
  scale_y_continuous(expand=c(0,0)) +
  xlab("dN") +
  ylab("# of genes") +
  bartheme()
print(dn_p)
# Distribution of dS when using gene trees
ds_filter_level = quantile(gene_rates$ds, 0.98)
#ds_filter_level = 0.03

ds_filter = subset(gene_rates, ds > ds_filter_level)$file
# Get a list of genes to filter out in subsequent analyses based on dS

print(paste("Removing", length(ds_filter), "genes with dS above", ds_filter_level, "from subsequent analyses."))
## [1] "Removing 236 genes with dS above 0.0338467469553527 from subsequent analyses."
#write.csv(ds_filter, file="../../data/rates/full-coding-mg94-local-ds-filter-0.95quant.csv", row.names=F)

< Back to summary

4 Average rates per branch with basic model fit (mg94-local)

Because we used the gene trees, in order to quantify rates on species tree branches we first needed to check whether branches in the species tree exist in a given gene tree.

This resulted in three categories for a species tree branch in a given gene tree:

  1. Full clade: All species in the clade that descends from the branch in the species tree are present as a monophyletic split in the gene tree.
  2. Parital clade: Not all species in the clade that descends from the branch in the species tree are present in the gene tree, but the ones that are present form a monophyletic split.
  3. Discordant/missing clade: Either the species in the clade that descends from the branch in the species tree do not form a monophyletic split in this gene tree (discordant) or ALL species in this clade are missing from this gene tree (missing)

Average rates are then calculated as (for dS, for example):

\[\frac{\sum_{i=1}^{n}\text{branch dS}_i}{n}\]

Where:

  • \(n = \text{# full clade genes} + \text{# partial clade genes}\) for this branch

4.0.1 Species tree branch presence/absence per gene

if(tree_type == "astral"){
  #tree_rates = read.csv("../../data/rates/full-coding-astral-cf-rooted-rates.csv", header=T)
  tree_rates_w_root = read.csv("../../data/rates/full-coding-astral-slac-branch-rates.csv", header=T)
}else if(tree_type == "concat"){
  tree_rates_w_root = read.csv("../../data/rates/full-coding-concat-slac-branch-rates.csv", header=T)
}
# Read the branch average rate data

uniq_info_cols = names(tree_info)[!(names(tree_info) %in% names(tree_rates_w_root))] # get non common names
uniq_info_cols = c(uniq_info_cols,"clade") # appending key parameter
# Get a list of columns from the tree_info df to join to the tree rates df

tree_rates_w_root = tree_rates_w_root %>% left_join((tree_info %>% select(uniq_info_cols)), by="clade")
# Select the columns from tree_info and join to tree_rates, merging by clade
# https://stackoverflow.com/a/61628157

tree_rates_w_root = tree_rates_w_root[order(tree_rates_w_root$node), ]
# Re-order the rows by the R node

tree_rates = subset(tree_rates_w_root, node.type != "root")

full_clade = select(tree_rates, clade, node.type, num.genes.full)
full_clade$label = "Full clade"
names(full_clade)[3] = "num.genes"

partial_clade = select(tree_rates, clade, node.type, num.genes.partial)
partial_clade$label = "Partial clade"
names(partial_clade)[3] = "num.genes"

descendant_counted = select(tree_rates, clade, node.type, num.genes.descendant.counted)
descendant_counted$label = "Descendant counted"
names(descendant_counted)[3] = "num.genes"

discordant_clade = select(tree_rates, clade, node.type, num.genes.discordant)
discordant_clade$label = "Discordant clade"
names(discordant_clade)[3] = "num.genes"

no_clade = select(tree_rates, clade, node.type, num.genes.missing)
no_clade$label = "Missing clade"
names(no_clade)[3] = "num.genes"

clade_counts = rbind(full_clade, partial_clade, descendant_counted, discordant_clade, no_clade)
# Convert branch categories to long format

clade_counts$label = factor(clade_counts$label, levels=c("Full clade", "Partial clade", "Descendant counted", "Discordant clade", "Missing clade"))

branch_counts = ggplot(clade_counts, aes(x=label, y=num.genes, group=label, color=node.type)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.25) +
  geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
  ylab("# of genes") +
  xlab("Species tree\nbranch classification") +
  bartheme() +
  theme(axis.text.x = element_text(angle=25, hjust=1)) +
  guides(colour=guide_legend(override.aes=list(alpha=1)))
print(branch_counts)

4.0.2 Distribution of genes per branch

tree_rates$num.genes.present = tree_rates$num.genes.full + tree_rates$num.genes.partial

presence_dist = ggplot(tree_rates, aes(x=num.genes.present)) +
  geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=1), color="#ececec") +
  scale_y_continuous(expand=c(0,0)) +
  xlab("Genes per branch") +
  ylab("# of branches") +
  bartheme()
print(presence_dist)

t_data = data.frame("Stat"=c("Branch with most genes", "Branch with fewest genes", "Median genes per branch", "Mean genes per branch"), 
                    "Branch label"=c(tree_rates$clade[tree_rates$num.genes.present==max(tree_rates$num.genes.present, na.rm=T)][2],
                                     tree_rates$node[tree_rates$num.genes.present==min(tree_rates$num.genes.present, na.rm=T)][2],
                                     "NA", "NA"), 
                    "Num genes"=c(tree_rates$num.genes.present[tree_rates$num.genes.present==max(tree_rates$num.genes.present, na.rm=T)][2],
                                  tree_rates$num.genes.present[tree_rates$num.genes.present==min(tree_rates$num.genes.present, na.rm=T)][2],
                                  median(tree_rates$num.genes.present, na.rm=T),
                                  mean(tree_rates$num.genes.present, na.rm=T)
                                  )
                    )


t_data %>% kable(caption="Branch statistics") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)
Branch statistics
Stat Branch.label Num.genes
Branch with most genes NA NA
Branch with fewest genes NA NA
Median genes per branch NA 7446.500
Mean genes per branch NA 6727.524

These measures are highly correlated with gene concordance factors:

4.0.3 Correlation between gCF and the % of genes that contain the descending clade for each species tree branch

tree_rates$clade.perc = (tree_rates$num.genes.full + tree_rates$num.genes.partial) / (tree_rates$num.genes.full + tree_rates$num.genes.partial + tree_rates$num.genes.descendant.counted + tree_rates$num.genes.discordant + tree_rates$num.genes.missing)

p = ggplot(tree_rates, aes(x=gcf, y=clade.perc)) +
  geom_point(size=2, alpha=0.4, color="#333333") +
  geom_smooth(method="lm", se=F, linetype="dashed", color="#920000") +
  xlab("gCF") + 
  ylab("% of genes with clade present") + 
  bartheme() +
  theme(legend.position="none")
print(p)

4.0.4 Averaging rates across branches

To control for the fact that different branches will be present in different numbers of genes, we can calculate average rates per branch by summing the raw counts of the number of sites and the number of substitutions across all genes in which that branch is present as a full or partial clade.

For a given branch, \(b\):

\[dN_b = \frac{\sum_{g \in G_b}N_g}{\sum_{g \in G_b}EN_g}\]

and

\[dS_b = \frac{\sum_{g \in G_b}S_g}{\sum_{g \in G_b}ES_g}\]

where \(G_b\) is the set of genes that contain branch \(b\). Then,

\[\omega_b = \frac{dN_b}{dS_b}\]

This results in the following distributions for average rates across branches (green lines indicating 95th percentile of each rate):

ds_95_perc = quantile(tree_rates$dS, 0.95, na.rm=T)
dn_95_perc = quantile(tree_rates$dN, 0.95, na.rm=T)

p = ggplot(tree_rates, aes(x=dS, y=dN, color=node.type)) +
  geom_point(size=2, alpha=0.2) +
  geom_text_repel(aes(label=ifelse(dS>ds_95_perc | dN>dn_95_perc, as.character(node), '')), show_guide=F) +
  #(aes(label=ifelse(avg.dN>dn_95_perc, as.character(node), '')), show_guide=F) +
  geom_vline(xintercept=ds_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
  geom_hline(yintercept=dn_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
  #geom_smooth(method="lm", se=F, ) +
  xlab("dS per branch") + 
  ylab("dN per branch") + 
  bartheme() +
  theme(legend.position="bottom") +
  guides(colour = guide_legend(override.aes = list(alpha = 1)))
p = ggExtra::ggMarginal(p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=1), color="#666666")
print(p)

tree_rates$ds.outlier = ifelse(tree_rates$dS>ds_95_perc,tree_rates$node,'')
tree_rates$dn.outlier = ifelse(tree_rates$dN>dn_95_perc,tree_rates$node,'')

4.0.5 The tree with branches colored by dS and outliers labeled:

h = corecol(numcol=1, pal="wilke", offset=1)
l = corecol(numcol=1, offset=1)

rate_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=log(tree_rates_w_root$dS))) +
  scale_color_continuous(name='log dS', low=l, high=h) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  theme(legend.position=c(0.05,0.9)) +
  geom_label(aes(x=branch, label=ifelse(tree_rates_w_root$dS>ds_95_perc,as.character(node),'')), label.size=NA, fill="transparent")
  #geom_text(aes(label=node), hjust=-.1, color="#006ddb")
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(rate_tree)

4.0.6 The tree with branches colored by dN and outliers labeled:

h = corecol(numcol=1, pal="wilke", offset=1)
l = corecol(numcol=1, offset=1)

rate_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=log(tree_rates_w_root$dN))) +
  scale_color_continuous(name='log dN', low=l, high=h) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  theme(legend.position=c(0.05,0.9)) +
  geom_label(aes(x=branch, label=ifelse(tree_rates_w_root$dN > dn_95_perc, as.character(node) ,'')), label.size=NA, fill="transparent")
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(rate_tree)

4.0.7 Average dN/dS per branch

dnds_p = ggplot(tree_rates, aes(x=dNdS)) +
  geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1), color="#666666") +
  scale_y_continuous(expand=c(0,0)) +
  xlab("dN/dS") +
  ylab("# of branches") +
  bartheme()
print(dnds_p)

# Distribution of dN/dS when using gene trees

dnds_95_perc = quantile(tree_rates$dNdS, 0.95, na.rm=T)
tree_rates$dnds.outlier = ifelse(tree_rates$dNdS>dnds_95_perc,tree_rates$node,'')

4.0.8 The tree with branches colored by dN/dS:

h = corecol(numcol=1, pal="wilke", offset=1)
l = corecol(numcol=1, offset=1)

rate_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=log(tree_rates_w_root$dNdS))) +
  scale_color_continuous(name='log dN/dS', low=l, high=h) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  theme(legend.position=c(0.05,0.9)) +
  geom_label(aes(x=branch, label=ifelse(tree_rates_w_root$dNdS>dnds_95_perc ,as.character(node), '')), label.size=NA, fill="transparent")
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(rate_tree)

5 Rates and discordance

5.0.1 dS vs. concordance factors

Only branches with avg. dS < 0.05

ds_gcf_p = ggplot(subset(tree_rates, node.type!="ROOT" & dS < ds_95_perc), aes(x=dS, y=gcf)) +
  geom_point(size=3, alpha=0.5) +
  #geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
  #geom_smooth(method="lm", se=F, ) +
  xlab("dS per branch") + 
  ylab("gCF per branch") + 
  bartheme()
  #theme(legend.position="bottom") +
  #guides(colour = guide_legend(override.aes = list(alpha = 1)))
ds_gcf_p = ggExtra::ggMarginal(ds_gcf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666")

ds_scf_p = ggplot(subset(tree_rates, node.type!="ROOT" & dS < ds_95_perc), aes(x=dS, y=scf)) +
  geom_point(size=3, alpha=0.5) +
  #geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
  #geom_smooth(method="lm", se=F, ) +
  xlab("dS per branch") + 
  ylab("sCF per branch") + 
  bartheme()
  #theme(legend.position="bottom") +
  #guides(colour = guide_legend(override.aes = list(alpha = 1)))
ds_scf_p = ggExtra::ggMarginal(ds_scf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=5), color="#666666")

p = plot_grid(ds_gcf_p, ds_scf_p, ncol=2)
print(p)

5.0.2 dN vs. concordance factors

Only branches with avg. dN < 0.01

dn_gcf_p = ggplot(subset(tree_rates, node.type!="ROOT" & dN < dn_95_perc), aes(x=dN, y=gcf)) +
  geom_point(size=3, alpha=0.5) +
  #geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
  #geom_smooth(method="lm", se=F, ) +
  xlab("dN per branch") + 
  ylab("gCF per branch") + 
  bartheme()
  #theme(legend.position="bottom") +
  #guides(colour = guide_legend(override.aes = list(alpha = 1)))
dn_gcf_p = ggExtra::ggMarginal(dn_gcf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666")

dn_scf_p = ggplot(subset(tree_rates, node.type!="ROOT" & dN < dn_95_perc), aes(x=dN, y=scf)) +
  geom_point(size=3, alpha=0.5) +
  #geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
  #geom_smooth(method="lm", se=F, ) +
  xlab("dN per branch") + 
  ylab("sCF per branch") + 
  bartheme()
  #theme(legend.position="bottom") +
  #guides(colour = guide_legend(override.aes = list(alpha = 1)))
dn_scf_p = ggExtra::ggMarginal(dn_scf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=5), color="#666666")

p = plot_grid(dn_gcf_p, dn_scf_p, ncol=2)
print(p)

5.0.3 dN/dS vs. concordance factors

Only branches with avg. dN/dS < 0.5

dnds_gcf_p = ggplot(subset(tree_rates, node.type!="ROOT" & dNdS < 0.5), aes(x=dNdS, y=gcf)) +
  geom_point(size=3, alpha=0.5) +
  #geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
  #geom_smooth(method="lm", se=F, ) +
  xlab("dN/dS per branch") + 
  ylab("gCF per branch") + 
  bartheme()
  #theme(legend.position="bottom") +
  #guides(colour = guide_legend(override.aes = list(alpha = 1)))
dnds_gcf_p = ggExtra::ggMarginal(dnds_gcf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666")

dnds_scf_p = ggplot(subset(tree_rates, node.type!="ROOT" & dNdS < 0.5), aes(x=dNdS, y=scf)) +
  geom_point(size=3, alpha=0.5) +
  #geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
  #geom_smooth(method="lm", se=F, ) +
  xlab("dN/dS per branch") + 
  ylab("sCF per branch") + 
  bartheme()
  #theme(legend.position="bottom") +
  #guides(colour = guide_legend(override.aes = list(alpha = 1)))
dnds_scf_p = ggExtra::ggMarginal(dnds_scf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=5), color="#666666")

p = plot_grid(dnds_gcf_p, dnds_scf_p, ncol=2)
print(p)

< Back to summary

6 Basic rate and phenotype correlations with tip branches

pheno = read.csv("../../data/phenotype-data/combined-phenotype-data.csv", header=T, comment.char="#")
tips = subset(tree_rates, node.type=="tip")
names(tips)[2] = "sample"
pheno_rates = merge(pheno, tips, by="sample")

pheno_rates = select(pheno_rates, sample, Adult_Mass.g., Total_Length.mm., Head.Body_Length.mm., Tail_Length.mm., Hind_Foot_Length.mm., Relative_Tail_Length, Relative_Hind_Foot_Length, dN, dS, dNdS)

pheno_rates_long = melt(pheno_rates, id.vars=c("sample", "dN", "dS", "dNdS"))

#pheno_rates_long = gather(pheno_rates, sample, value, Adult_Mass.g., Total_Length.mm., Head.Body_Length.mm., Tail_Length.mm., Hind_Foot_Length.mm., Relative_Tail_Length, Relative_Hind_Foot_Length)

6.0.1 Avg dS per tip vs phenotype

p = ggplot(pheno_rates_long, aes(x=log(value), y=log(dS))) +
  geom_point(size=2, alpha=0.2, color="#333333") +
  geom_smooth(method="lm", se=F, linetype="dashed", color=corecol(numcol=1, pal="wilke", offset=2)) +
  xlab("log avg. trait value per tip") +
  ylab("log dS per tip") +
  facet_wrap(~variable, scales="free_x") +
  bartheme()
print(p)

6.0.2 Avg dN per tip vs phenotype

p = ggplot(pheno_rates_long, aes(x=log(value), y=log(dN))) +
  geom_point(size=2, alpha=0.2, color="#333333") +
  geom_smooth(method="lm", se=F, linetype="dashed", color=corecol(numcol=1, pal="wilke", offset=2)) +
  xlab("log avg. trait value per tip") +
  ylab("log dN per tip") +
  facet_wrap(~variable, scales="free_x") +
  bartheme()
print(p)

6.0.3 Avg dN/dS per tip vs phenotype

p = ggplot(pheno_rates_long, aes(x=log(value), y=log(dNdS))) +
  geom_point(size=2, alpha=0.2, color="#333333") +
  geom_smooth(method="lm", se=F, linetype="dashed", color=corecol(numcol=1, pal="wilke", offset=2)) +
  xlab("log avg. trait value per tip") +
  ylab("log dN/dS per tip") +
  facet_wrap(~variable, scales="free_x") +
  bartheme()
print(p)

< Back to summary